# TODO: can't use Boston College in example of "counterfactual high prestige"
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:stats':
##
## filter
library(ggraph)
# library(smglr)
library(broom)
library(ggforce)
library(ggbeeswarm)
library(assertthat)
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
library(tictoc)
data_folder = '../data/'
plots_path = '../output/03_'
paper_folder = '../paper/'
## Load data ----
load(str_c(data_folder, '02_parsed.Rdata'))
nrow(individual_df)
## [1] 3448
individual_df %>%
filter(permanent) %>%
nrow()
## [1] 1809
individual_df %>%
filter(permanent) %>%
nrow() %>%
{./nrow(individual_df)}
## [1] 0.524652
There are 203 PhD programs producing graduate students in the data
univ_df %>%
filter(total_placements > 0) %>%
nrow()
## [1] 220
167 PhD programs with permanent placements
individual_df %>%
filter(permanent) %>%
count(placing_univ) %>%
nrow()
## [1] 179
univ_df %>%
filter(perm_placements > 0) %>%
nrow()
## [1] 180
Finding: 37 PhD programs (37/203 = 18%) produce about 50% of permanent placements
ggplot(univ_df, aes(perm_placement_rank, frac_cum_perm_placements)) +
geom_step() +
scale_x_continuous(labels = scales::percent_format(),
name = 'PhD Programs') +
scale_y_continuous(labels = scales::percent_format(),
name = 'Permanent Placements')
## Warning: Removed 25 row(s) containing missing values (geom_path).
univ_df %>%
filter(frac_cum_perm_placements <= .5) %>%
arrange(perm_placement_rank) %>%
mutate(perm_placement_rank = row_number()) %>%
select(perm_placement_rank, univ_name,
perm_placements, frac_cum_perm_placements) %>%
knitr::kable()
| perm_placement_rank | univ_name | perm_placements | frac_cum_perm_placements |
|---|---|---|---|
| 1 | University of Oxford | 42 | 0.0232172 |
| 2 | Graduate Center of the City University of New York | 40 | 0.0453289 |
| 3 | University of Notre Dame | 37 | 0.0657822 |
| 4 | Princeton University | 35 | 0.0851299 |
| 5 | The Catholic University of America* | 30 | 0.1017137 |
| 6 | University of Toronto | 29 | 0.1177446 |
| 7 | Stanford University | 28 | 0.1332228 |
| 8 | University of Southern California | 27 | 0.1481481 |
| 9 | University of Michigan | 26 | 0.1625207 |
| 10 | Yale University | 26 | 0.1768933 |
| 11 | Rutgers University | 26 | 0.1912659 |
| 12 | Columbia University | 25 | 0.2050857 |
| 13 | University of North Carolina at Chapel Hill | 25 | 0.2189055 |
| 14 | University of Chicago | 25 | 0.2327253 |
| 15 | Massachusetts Institute of Technology | 24 | 0.2459923 |
| 16 | Pennsylvania State University | 23 | 0.2587065 |
| 17 | University of California, Berkeley | 23 | 0.2714207 |
| 18 | New York University | 23 | 0.2841349 |
| 19 | Katholieke Universiteit Leuven | 23 | 0.2968491 |
| 20 | Emory University | 22 | 0.3090105 |
| 21 | Harvard University | 22 | 0.3211719 |
| 22 | University of California, Los Angeles | 21 | 0.3327805 |
| 23 | Vanderbilt University | 21 | 0.3443892 |
| 24 | University of Wisconsin-Madison | 21 | 0.3559978 |
| 25 | Duquesne University | 20 | 0.3670536 |
| 26 | Boston College | 20 | 0.3781095 |
| 27 | University of Arizona | 20 | 0.3891653 |
| 28 | The New School | 20 | 0.4002211 |
| 29 | Baylor University | 20 | 0.4112769 |
| 30 | Saint Louis University | 19 | 0.4217800 |
| 31 | University of South Florida | 19 | 0.4322830 |
| 32 | Cornell University | 19 | 0.4427861 |
| 33 | University of Virginia | 18 | 0.4527363 |
| 34 | Australian National University | 18 | 0.4626866 |
| 35 | Stony Brook University | 18 | 0.4726368 |
| 36 | Villanova University | 17 | 0.4820343 |
| 37 | DePaul University | 17 | 0.4914317 |
## Placements at PhD-producing programs
grad_programs = univ_df %>%
filter(total_placements > 0, country %in% c('U.S.')) %>%
pull(univ_id)
# gp2 = individual_df %>%
# count(placing_univ_id) %>%
# pull(placing_univ_id)
individual_df %>%
filter(permanent, hiring_univ_id %in% grad_programs) %>%
filter(placing_univ_id %in% grad_programs) %>%
filter(position_type == 'Tenure-Track') %>%
count(placing_univ) %>%
arrange(desc(n)) %>%
rename(phd_program_placements = n) %>%
mutate(cum_phd_program_placements = cumsum(phd_program_placements),
share_phd_program_placements = cum_phd_program_placements / sum(phd_program_placements)) %>%
slice(1:20) %>%
knitr::kable(digits = 2)
| placing_univ | phd_program_placements | cum_phd_program_placements | share_phd_program_placements |
|---|---|---|---|
| Princeton University | 18 | 18 | 0.06 |
| University of California, Berkeley | 13 | 31 | 0.11 |
| Harvard University | 12 | 43 | 0.15 |
| Massachusetts Institute of Technology | 12 | 55 | 0.19 |
| New York University | 12 | 67 | 0.24 |
| Rutgers University | 12 | 79 | 0.28 |
| Stanford University | 10 | 89 | 0.31 |
| University of Chicago | 10 | 99 | 0.35 |
| Yale University | 10 | 109 | 0.39 |
| University of Arizona | 9 | 118 | 0.42 |
| University of California, Irvine (LPS) | 8 | 126 | 0.45 |
| Columbia University | 7 | 133 | 0.47 |
| Pennsylvania State University | 7 | 140 | 0.49 |
| University of California, Los Angeles | 7 | 147 | 0.52 |
| University of Michigan | 7 | 154 | 0.54 |
| University of North Carolina at Chapel Hill | 7 | 161 | 0.57 |
| University of Notre Dame | 7 | 168 | 0.59 |
| DePaul University | 6 | 174 | 0.61 |
| University of Pittsburgh | 6 | 180 | 0.64 |
| University of Pittsburgh (HPS) | 6 | 186 | 0.66 |
## build network ----
## NB Only permanent placements
hiring_network = individual_df %>%
filter(permanent) %>%
select(placing_univ_id, hiring_univ_id, everything()) %>%
graph_from_data_frame(directed = TRUE,
vertices = univ_df) %>%
as_tbl_graph() %>%
## Clean nodes (universities) that aren't in the network
mutate(degree = centrality_degree(mode = 'total')) %>%
filter(degree > 0)
hiring_network
## # A tbl_graph: 936 nodes and 1809 edges
## #
## # A directed multigraph with 17 components
## #
## # Node Data: 936 × 23 (active)
## name univ_name cluster_2 cluster_3 cluster_6 cluster_label city state
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 372 Adelphi … <NA> <NA> <NA> <NA> Gard… NY
## 2 117 Air Forc… <NA> <NA> <NA> <NA> U.S.… CO
## 3 10409 Al-Asmar… <NA> <NA> <NA> <NA> <NA> <NA>
## 4 992 Albany M… <NA> <NA> <NA> <NA> Alba… NY
## 5 353 Albany S… <NA> <NA> <NA> <NA> Alba… GA
## 6 764 Alberto … <NA> <NA> <NA> <NA> Sant… <NA>
## # … with 930 more rows, and 15 more variables: country <chr>,
## # total_placements <int>, perm_placements <int>, perm_placement_rate <dbl>,
## # frac_perm_placements <dbl>, cum_perm_placements <int>,
## # frac_cum_perm_placements <dbl>, perm_placement_rank <dbl>,
## # aos_diversity <dbl>, m_count <dbl>, w_count <dbl>, o_count <dbl>,
## # gender_na_count <dbl>, frac_w <dbl>, degree <dbl>
## #
## # Edge Data: 1,809 × 14
## from to person_id gender ethnicity race aos_category aos
## <int> <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 696 757 1 m 4 7 Science, Lo… Cogn…
## 2 209 910 2 m 2 5 History and… Medi…
## 3 693 727 3 m 4 7 LEMM Lang…
## # … with 1,806 more rows, and 6 more variables: graduation_year <int>,
## # placing_univ <chr>, placement_year <int>, hiring_univ <chr>,
## # position_type <chr>, permanent <lgl>
assert_that(length(E(hiring_network)) == {individual_df %>%
filter(permanent) %>%
nrow()},
msg = 'Number of edges in network ≠ number of perm. placements')
## [1] TRUE
## 1 giant component contains almost all of the schools
components(hiring_network)$csize
## [1] 902 2 5 2 2 2 2 1 3 2 2 2 2 2 2 2 1
Out- and in-centrality
to_reverse = function (graph) {
## Reverse edge direction
if (!is.directed(graph))
return(graph)
e <- get.data.frame(graph, what="edges")
## swap "from" & "to"
neworder <- 1:length(e)
neworder[1:2] <- c(2,1)
e <- e[neworder]
names(e) <- names(e)[neworder]
vertex_df = get.data.frame(graph, what = 'vertices')
if (ncol(vertex_df) > 0) {
return(as_tbl_graph(graph.data.frame(e, vertices = vertex_df)))
} else {
return(as_tbl_graph(graph.data.frame(e)))
}
}
set.seed(42)
nzmin = function(x, na.rm = TRUE) {
min(x[x > 0], na.rm = na.rm)
}
damping = .3
hiring_network = hiring_network %>%
mutate(in_centrality = centrality_eigen(weights = NA,
directed = TRUE),
in_pr = centrality_pagerank(weights = NA,
directed = TRUE,
damping = damping,
personalized = NULL)) %>%
morph(to_reverse) %>%
mutate(out_centrality = centrality_eigen(weights = NA,
directed = TRUE),
out_pr = centrality_pagerank(weights = NA,
directed = TRUE,
damping = damping,
personalized = NULL)) %>%
unmorph() %>%
## Trim 0s to minimum non-zero values
mutate(out_centrality = ifelse(out_centrality == 0,
nzmin(out_centrality),
out_centrality),
in_centrality = ifelse(in_centrality == 0,
nzmin(in_centrality),
in_centrality))
## Check relationship btwn unweighted multiedges and weighted edges
## Strong correlation, esp by approx rank and high/low prestige
## But some movement, both numerically and ordinally
# E(hiring_network)$weight = count_multiple(hiring_network)
# hiring_network_simp = simplify(hiring_network)
#
# set.seed(42)
# V(hiring_network_simp)$out_centrality = hiring_network_simp %>%
# graph.reverse() %>%
# eigen_centrality(directed = TRUE) %>%
# .$vector
#
# tibble(name = V(hiring_network)$univ_name,
# multi = V(hiring_network)$out_centrality,
# simp = V(hiring_network_simp)$out_centrality) %>%
# ggplot(aes(log10(simp), log10(multi))) +
# geom_point() +
# stat_function(fun = function (x) x)
## exploring centrality scores ----
PageRank centrality is almost entirely determined by degree So we use eigenvector centrality instead
ggplot(as_tibble(hiring_network), aes(degree, log10(out_pr))) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'
There are two clear groups of centrality scores, with high scores (in the range of ~10^-12 to 1) and low scores (10^-15 and smaller).
##
## NB there seem to be (small?) differences in scores (at the low end?) across runs of the centrality algorithm
ggplot(as_tibble(hiring_network), aes(out_centrality)) +
# geom_density() +
geom_histogram(binwidth = 1, fill = 'white', color = 'black') +
geom_rug() +
scale_x_continuous(trans = 'log10',
name = 'Out centrality',
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
facet_zoom(x = out_centrality > 10^-12,
ylim = c(0, 20),
# ylim = c(0, .02),
show.area = FALSE,
shrink = TRUE,
zoom.size = .3,
horizontal = FALSE) +
theme_bw()
ggsave(str_c(plots_path, 'out_centrality.png'),
height = 3, width = 6)
ggsave(str_c(paper_folder, 'fig_out_density.png'),
height = 3, width = 6)
ggplot(as_tibble(hiring_network), aes(in_centrality)) +
geom_density() + geom_rug() +
scale_x_continuous(trans = 'log10')
ggplot(as_tibble(hiring_network),
aes(out_centrality, in_centrality,
color = cluster_label,
text = univ_name)) +
geom_jitter() +
scale_x_log10() + scale_y_log10()
plotly::ggplotly()
# ## Check stability of centrality calculation
# iterated_centrality = 1:500 %>%
# map(~ {hiring_network %>%
# graph.reverse() %>%
# eigen_centrality(directed = TRUE, weights = NA) %>%
# .$vector}) %>%
# transpose() %>%
# map(unlist) %>%
# map(~ tibble(out_centrality = .)) %>%
# bind_rows(.id = 'univ_id') %>%
# group_by(univ_id) %>%
# summarize(min = min(out_centrality),
# max = max(out_centrality),
# median = median(out_centrality))
#
# ## Lots of variation among low-prestige
# ggplot(iterated_centrality,
# aes(reorder(univ_id, median),
# median)) +
# geom_pointrange(aes(ymin = min, ymax = max)) +
# scale_y_log10()
#
# ## But extremely stable results among high-prestige
# iterated_centrality %>%
# filter(min > 10^-12) %>%
# ggplot(aes(reorder(univ_id, median),
# median)) +
# geom_pointrange(aes(ymin = min, ymax = max)) +
# scale_y_log10()
We completely rewire the network, preserving out-degree (number of new PhDs) and in-degree (number of permanent hires), but otherwise randomly matching job-seekers to positions. We then recalculate out-centrality. The plots below show the out-centrality distributions for each random rewiring, with the actual distribution in red. The distributions are always bimodal, indicating that the observed split into high- and low-centrality programs is due in part to the way PhD production and hiring are distributed across institutions. However, the high-centrality subset is never as small as in the actual hiring network. This indicates that bimodiality is not due only to the degree distributions. Some other factor is exacerbating the structural inequality in the hiring network.
set.seed(78910)
map(1:100,
~ sample_degseq(degree(hiring_network, mode = 'out'),
degree(hiring_network, mode = 'in'))
) %>%
map(to_reverse) %>%
map(eigen_centrality, directed = TRUE, weights = NULL) %>%
map(~ .$vector) %>%
map(log10) %>%
map(density) %>%
map(~ tibble(x = .$x, y = .$y)) %>%
bind_rows(.id = 'iteration') %>%
ggplot(aes(x, y)) +
geom_line(aes(group = iteration), alpha = .5) +
stat_density(data = as_tibble(hiring_network), geom = 'line',
aes(log10(out_centrality), ..density..),
color = 'red', size = 1)
Finding: High output is only modestly correlated w/ centrality. Programs like Leuven, New School, and Boston College produce lots of PhDs, but they aren’t placed into the high-centrality departments.
ggplot(as_tibble(hiring_network),
aes(total_placements, log10(out_centrality))) +
geom_point(aes(text = univ_name)) +
geom_smooth()
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
## Warning: Removed 735 rows containing missing values (geom_point).
plotly::ggplotly()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
# univ_df %>%
# mutate(out_centrality_log = log10(out_centrality)) %>%
# filter(!is.na(out_centrality_log) &
# out_centrality > 0 &
# !is.na(total_placements)) %>%
# select(total_placements, out_centrality_log) %>%
# cor()
hiring_network %>%
select(total_placements, out_centrality) %>%
as_tibble() %>%
filter(complete.cases(.)) %>%
mutate(out_centrality = log10(out_centrality)) %>%
cor()
## total_placements out_centrality
## total_placements 1.0000000 0.5480276
## out_centrality 0.5480276 1.0000000
ggplot(as_tibble(hiring_network),
aes(perm_placement_rank, log10(out_centrality))) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
## Warning: Removed 735 rows containing missing values (geom_point).
ggplot(as_tibble(hiring_network),
aes(log10(out_centrality), perm_placement_rate)) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
## Warning: Removed 735 rows containing missing values (geom_point).
plotly::ggplotly()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 735 rows containing non-finite values (stat_smooth).
as_tibble(hiring_network) %>%
select(total_placements, perm_placement_rate,
out_centrality) %>%
filter(complete.cases(.)) %>%
mutate(out_centrality = log10(out_centrality)) %>%
cor()
## total_placements perm_placement_rate out_centrality
## total_placements 1.00000000 0.02747104 0.5480276
## perm_placement_rate 0.02747104 1.00000000 0.1713565
## out_centrality 0.54802764 0.17135653 1.0000000
## Add centrality scores to univ_df
univ_df = hiring_network %>%
as_tibble() %>%
rename(univ_id = name) %>%
left_join(univ_df, .)
## Joining, by = c("univ_id", "univ_name", "cluster_2", "cluster_3", "cluster_6", "cluster_label", "city", "state", "country", "total_placements", "perm_placements", "perm_placement_rate", "frac_perm_placements", "cum_perm_placements", "frac_cum_perm_placements", "perm_placement_rank", "aos_diversity", "m_count", "w_count", "o_count", "gender_na_count", "frac_w")
## Movement down the prestige hierarchy
individual_df %>%
filter(permanent) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('placing_univ_id' = 'univ_id')) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('hiring_univ_id' = 'univ_id')) %>%
# filter(out_centrality.y > 10^-12) %>%
select(person_id, aos_category,
placing = out_centrality.x,
hiring = out_centrality.y) %>%
gather(variable, value, placing, hiring) %>%
mutate(variable = fct_rev(variable)) %>%
ggplot(aes(variable, log10(value))) +
geom_point() +
geom_line(aes(group = person_id),
alpha = .25) +
xlab('university') +
ylab('centrality (log10)') +
theme_minimal()
## Warning: attributes are not identical across measure variables;
## they will be dropped
ggsave(str_c(plots_path, 'prestige_movement.png'),
width = 3, height = 4)
ggsave(str_c(paper_folder, 'fig_crossing.png'),
width = 3, height = 4)
## Diagonal line indicates where hiring program has the same centrality as the placing program.
## Most placements are below this line, indicating that the centrality measure captures the idea that people typically are hired by programs with lower status
## The few placements above the line indicate that, even when individuals are hired by programs with higher status, the increase is relatively minor: no more than 1 point on the log10 scale
individual_df %>%
filter(permanent) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('placing_univ_id' = 'univ_id')) %>%
left_join(select(univ_df, univ_id, out_centrality),
by = c('hiring_univ_id' = 'univ_id')) %>%
filter(out_centrality.y > 10^-12) %>%
select(person_id,
placing = out_centrality.x,
hiring = out_centrality.y) %>%
ggplot(aes(log10(placing), log10(hiring))) +
geom_jitter() +
stat_function(fun = identity)
## community detection ----
## Extract giant component
hiring_network_gc = hiring_network %>%
components %>%
.$csize %>%
{which(. == max(.))} %>%
{components(hiring_network)$membership == .} %>%
which() %>%
induced_subgraph(hiring_network, .) %>%
as_tbl_graph()
walk_len = rep(2:100, 1)
## ~24 sec
tic()
comm_stats = walk_len %>%
map(~ cluster_walktrap(hiring_network_gc, steps = .x)) %>%
map(~ list(sizes = sizes(.x), length = length(.x))) %>%
map_dfr(~ tibble(H = -sum(.x$sizes/sum(.x$sizes) * log2(.x$sizes/sum(.x$sizes))),
n_comms = .x$length)) %>%
mutate(walk_len = walk_len,
delta_H = log2(n_comms) - H)
toc()
## 21.099 sec elapsed
ggplot(comm_stats, aes(walk_len, H)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(comm_stats, aes(walk_len, n_comms)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(comm_stats, aes(n_comms, H)) +
geom_text(aes(label = walk_len))
Select the walk length that minimizes both delta_H (flatter community distribution) and n_comms (fewer communities) using regression residuals
walk_length = lm(H ~ n_comms, data = comm_stats) %>%
augment(comm_stats) %>%
# ggplot(aes(walk_len, .resid)) +
# geom_text(aes(label = walk_len))
arrange(desc(.resid)) %>%
pull(walk_len) %>%
first()
walk_length
## [1] 5
communities = cluster_walktrap(hiring_network_gc, steps = walk_length)
V(hiring_network_gc)$community = membership(communities)
Analysis of community detection
## Summary of community sizes
hiring_network_gc %>%
as_tibble() %>%
count(community) %>%
pull(n) %>%
summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 7.000 9.495 11.000 113.000
univ_df = univ_df %>%
left_join({hiring_network_gc %>%
as_tibble() %>%
select(univ_id = name, community) %>%
mutate(community = as.character(community))})
## Joining, by = "univ_id"
univ_df %>%
filter(!is.na(community)) %>%
count(community) %>%
ggplot(aes(n)) +
geom_bar(aes(text = n)) +
scale_x_continuous(name = 'Community size (# programs)')
## Warning: Ignoring unknown aesthetics: text
plotly::ggplotly()
cluster_vars = univ_df %>%
select(matches('cluster')) %>%
names()
univ_df %>%
filter(!is.na(community), !is.na('cluster_lvl4'))
## # A tibble: 902 × 28
## univ_id univ_name cluster_2 cluster_3 cluster_6 cluster_label city state
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 372 Adelphi Univ… <NA> <NA> <NA> <NA> Gard… NY
## 2 117 Air Force Ac… <NA> <NA> <NA> <NA> U.S.… CO
## 3 10409 Al-Asmarya U… <NA> <NA> <NA> <NA> <NA> <NA>
## 4 992 Albany Medic… <NA> <NA> <NA> <NA> Alba… NY
## 5 353 Albany State… <NA> <NA> <NA> <NA> Alba… GA
## 6 764 Alberto Hurt… <NA> <NA> <NA> <NA> Sant… <NA>
## 7 3393 Allen Univer… <NA> <NA> <NA> <NA> Colu… SC
## 8 3513 Amarillo Col… <NA> <NA> <NA> <NA> Amar… TX
## 9 385 American Uni… <NA> <NA> <NA> <NA> Wash… DC
## 10 1134 American Uni… <NA> <NA> <NA> <NA> <NA> NULL
## # … with 892 more rows, and 20 more variables: country <chr>,
## # total_placements <int>, perm_placements <int>, perm_placement_rate <dbl>,
## # frac_perm_placements <dbl>, cum_perm_placements <int>,
## # frac_cum_perm_placements <dbl>, perm_placement_rank <dbl>,
## # aos_diversity <dbl>, m_count <dbl>, w_count <dbl>, o_count <dbl>,
## # gender_na_count <dbl>, frac_w <dbl>, degree <dbl>, in_centrality <dbl>,
## # in_pr <dbl>, out_centrality <dbl>, out_pr <dbl>, community <chr>
Finding: There is no correlation between semantic clusters and topological communities.
univ_df %>%
filter(!is.na(community), !is.na(cluster_label)) %>%
select(community, cluster_label) %>%
table() %>%
chisq.test(simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: .
## X-squared = 164.16, df = NA, p-value = 0.06047
univ_df %>%
filter(!is.na(community), !is.na(cluster_label)) %>%
count(community, cluster_label) %>%
rename(cluster_n = n) %>%
group_by(community) %>%
mutate(community_tot = sum(cluster_n),
cluster_frac = cluster_n / community_tot,
H = sum(cluster_frac * log2(cluster_frac))) %>%
ungroup() %>%
ggplot(aes(fct_reorder(community, community_tot, .desc = FALSE),
cluster_n, fill = cluster_label)) +
geom_col() +
coord_flip() +
xlab('Topological communities') +
ylab('# of schools in community') +
scale_fill_brewer(palette = 'Set1',
name = 'Semantic\nclusters')
## high-prestige universities ----
## Start w/ NYU, and work upstream
## Only need to go 11 or 12 steps to get closure
1:25 %>%
map(~ make_ego_graph(hiring_network, order = .x,
nodes = '34', mode = 'in')) %>%
flatten() %>%
map(~ length(V(.x))) %>%
tibble(order = 1:length(.),
size = unlist(.)) %>%
ggplot(aes(order, size)) + geom_point()
prestigious = make_ego_graph(hiring_network, order = 12,
nodes = '34',
mode = 'in') %>%
.[[1]] %>%
as_tbl_graph()
## How large is the high-prestige community?
## 56 programs; 7% of all programs in the network;
## 28% of programs with at least 1 placement in the dataset
length(V(prestigious))
## [1] 83
length(V(prestigious)) / length(V(hiring_network))
## [1] 0.08867521
length(V(prestigious)) / sum(univ_df$total_placements > 0, na.rm = TRUE)
## [1] 0.3772727
## What fraction of hires are within high-prestige?
## 13% of all permanent hires; 7% of all hires
length(E(prestigious)) / length(E(hiring_network))
## [1] 0.1796573
length(E(prestigious)) / nrow(individual_df)
## [1] 0.09425754
# layout_prestigious = layout_with_focus(prestigious,
# which(V(prestigious)$univ_name == 'New York University')) %>%
# `colnames<-`(c('x', 'y')) %>%
# as_tibble()
layout_prestigious = create_layout(prestigious, 'focus',
focus = which(V(prestigious)$univ_name == 'New York University'))
# png(file = '02_prestigious_net.png',
# width = 11, height = 11, units = 'in', res = 400)
# set.seed(24)
ggraph(layout_prestigious) +
geom_node_label(aes(label = univ_name,
size = log10(out_centrality),
fill = log10(out_centrality)),
color = 'white') +
geom_edge_parallel(arrow = arrow(length = unit(.01, 'npc')),
alpha = .25,
strength = 5) +
scale_size_continuous(range = c(.5, 3), guide = 'none') +
scale_fill_viridis(name = 'out centrality (log10)') +
coord_cartesian(xlim = c(-3.5, 5.5), clip = 'on') +
theme_graph() +
theme(legend.position = 'bottom',
plot.margin = margin(0, unit = 'cm'))
## Warning: Ignoring unknown parameters: strength
ggsave(str_c(plots_path, 'prestigious_net.png'),
width = 6, height = 4, dpi = 600, scale = 2)
ggsave(str_c(paper_folder, 'fig_prestigious_net.png'),
width = 6, height = 4, dpi = 600, scale = 2)
# dev.off()
## High-prestige = high centrality group
univ_df = univ_df %>%
mutate(prestige = ifelse(univ_id %in% V(prestigious)$name,
'high-prestige',
'low-prestige'))
V(hiring_network)$prestigious = V(hiring_network)$out_centrality > 1e-12
V(hiring_network_gc)$prestigious = V(hiring_network_gc)$out_centrality > 1e-12
ggplot(univ_df, aes(prestige, log10(out_centrality))) +
geom_jitter()
## Warning: Removed 304 rows containing missing values (geom_point).
## Output alphabetical list of high-prestige programs
high_prestige_tab = univ_df %>%
filter(prestige == 'high-prestige') %>%
select(univ_name, cluster = cluster_label,
perm_placement_rate,
country,
# out_centrality
) %>%
# arrange(desc(out_centrality)) %>% view()
arrange(univ_name) %>%
mutate(perm_placement_rate = scales::percent_format(accuracy = 2)(perm_placement_rate)) %>%
knitr::kable(col.names = c('Program', 'Cluster',
'Placement Rate',
'Country'),
format = 'latex',
longtable = TRUE,
booktabs = TRUE,
# table.envir = 'sidewaystable',
label = 'high.prestige',
caption = 'High-prestige programs, in alphabetical order. Placement rate refers to placements in permanent academic positions, out of all graduates.')
write_file(high_prestige_tab, path = str_c(plots_path, 'high_prestige.tex'))
## Warning: The `path` argument of `write_file()` is deprecated as of readr 1.4.0.
## Please use the `file` argument instead.
write_file(high_prestige_tab, path = str_c(paper_folder, 'tab_high_prestige.tex'))
## Prestige status and clusters
## High-prestige are spread throughout, but #5 is mostly low-prestige
univ_df %>%
filter(!is.na(cluster_label)) %>%
ggplot(aes(cluster_label, color = prestige)) +
geom_point(stat = 'count') +
geom_line(aes(group = prestige), stat = 'count')
## High-prestige are mostly in the largest community
univ_df %>%
filter(!is.na(community)) %>%
ggplot(aes(as.integer(community), fill = prestige)) +
geom_bar()
## What fraction of high-prestige graduates end up in high-prestige programs?
## 24% of those w/ permanent placements
individual_df %>%
filter(permanent) %>%
left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
left_join(univ_df, by = c('hiring_univ_id' = 'univ_id')) %>%
select(placing = prestige.x, hiring = prestige.y) %>%
count(placing, hiring) %>%
group_by(placing) %>%
mutate(share = n / sum(n))
## # A tibble: 3 × 4
## # Groups: placing [2]
## placing hiring n share
## <chr> <chr> <int> <dbl>
## 1 high-prestige high-prestige 325 0.261
## 2 high-prestige low-prestige 920 0.739
## 3 low-prestige low-prestige 564 1
Finding: Median permanent placement rate for high-prestige programs is 22 points higher than for low-prestige programs. However, variation is also wide within each group;
This is also not yet controlling for graduation year, area, or demographics.
ggplot(univ_df, aes(prestige, perm_placement_rate,
label = univ_name)) +
# geom_sina(aes(size = total_placements),
# alpha = .5) +
geom_beeswarm(aes(size = total_placements),
priority = 'density',
cex = 2,
alpha = .5) +
geom_violin(color = 'red', draw_quantiles = .5,
scale = 'count',
fill = 'transparent') +
# geom_jitter(aes(size = total_placements)) +
scale_x_discrete(expand = expand_scale(add=c(0, 1))) +
scale_y_continuous(labels = scales::percent_format()) +
ylab('permanent placement rate') +
scale_size(name = 'total placements') +
theme_minimal()
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1020 rows containing missing values (position_beeswarm).
ggsave(str_c(plots_path, 'prestige_placement.png'),
width = 8, height = 4)
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1020 rows containing missing values (position_beeswarm).
ggsave(str_c(paper_folder, 'fig_placement.png'),
width = 8, height = 4)
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1020 rows containing missing values (position_beeswarm).
plotly::ggplotly()
## Warning: Removed 1020 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1020 rows containing missing values (position_beeswarm).
univ_df %>%
group_by(prestige) %>%
summarize_at(vars(perm_placement_rate),
funs(median, max, min), na.rm = TRUE)
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## # A tibble: 2 × 4
## prestige median max min
## <chr> <dbl> <dbl> <dbl>
## 1 high-prestige 0.579 1 0.188
## 2 low-prestige 0.385 1 0
## prestige stability ----
By randomly rewiring the network, we can test the stability of the prestige categories to data errors and the short time frame of our data. In each of 500 permutations, we randomly rewire 10% of the edges in the permanent hiring network, then calculate out-centralities on the rewired network. Using a threshold of 10^-12 for high-prestige status, we count the fraction of rewired networks in which each program is high-prestige.
## ~6 sec
tic()
set.seed(13579)
permutations = 1:500 %>%
## Rewire 10% of edges
map(~ {hiring_network %>%
rewire(keeping_degseq(loops = TRUE,
niter = floor(.05 * length(E(.)))))}) %>%
## Calculate out-centralities
map(~ {.x %>%
to_reverse() %>%
eigen_centrality(directed = TRUE, weights = NA) %>%
.$vector}) %>%
transpose() %>%
map(unlist) %>%
## Fraction where program is high-prestige
map(~ sum(. > 10^-12) / length(.)) %>%
map(~ tibble(frac_high_prestige = .)) %>%
bind_rows(.id = 'univ_id')
toc()
## 5.034 sec elapsed
## frac_high_prestige indicates the fraction of permutations in which the program was high-prestige
ggplot(permutations, aes(frac_high_prestige)) +
geom_density() +
geom_rug()
univ_df = left_join(univ_df, permutations)
## Joining, by = "univ_id"
Finding: Counterfactual high-prestige status is strongly correlated with centrality ranking.
ggplot(univ_df, aes(log10(out_centrality), frac_high_prestige)) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 304 rows containing non-finite values (stat_smooth).
## Warning: Removed 304 rows containing missing values (geom_point).
plotly::ggplotly()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 304 rows containing non-finite values (stat_smooth).
ggplot(univ_df, aes(perm_placements, frac_high_prestige, color = prestige)) +
geom_point(aes(text = univ_name)) +
geom_smooth(method = 'lm')
## Warning: Ignoring unknown aesthetics: text
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1039 rows containing non-finite values (stat_smooth).
## Warning: Removed 1039 rows containing missing values (geom_point).
plotly::ggplotly()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1039 rows containing non-finite values (stat_smooth).
Finding: For low-prestige programs, counterfactual prestige seems to depend on the extent of the program’s downstream hiring network. Compare Boston College (19 permanent placements; 40% high prestige) to Leuven (18 permanent placements; 20% high prestige).
focal_nodes = hiring_network %>%
as_tibble() %>%
filter(univ_name %in% c('Villanova University', 'Katholieke Universiteit Leuven')) %>%
pull(name)
focal_nodes_net = make_ego_graph(hiring_network, order = 10,
nodes = focal_nodes,
mode = 'out') %>%
map(as_tbl_graph) %>%
reduce(bind_graphs) %>%
mutate(perm_placements = ifelse(is.na(perm_placements), 0, perm_placements))
focal_nodes_layout = create_layout(focal_nodes_net, 'stress')
ggraph(focal_nodes_layout) +
geom_node_label(aes(label = univ_name,
size = perm_placements,
fill = perm_placements),
color = 'white') +
geom_edge_parallel(arrow = arrow(angle = 45,
length = unit(.1, 'inches'),
type = 'closed'),
alpha = .3) +
scale_size_continuous(range = c(1, 5),
# na.value = 1,
name = 'placements',
guide = 'none') +
scale_fill_viridis(na.value = 'black', name = 'permanent\nplacements') +
theme_graph()
ggsave(str_c(plots_path, 'focal_nodes.png'),
height = 6, width = 10, dpi = 600, scale = 1.25)
ggsave(str_c(paper_folder, 'fig_focal_nodes.png'),
height = 6, width = 10, dpi = 600, scale = 1.25)
## plotting ----
Coarser community structure
# large_comms = which(sizes(communities) > 20)
# V(hiring_network_gc)$community_coarse = ifelse(
# V(hiring_network_gc)$community %in% large_comms,
# V(hiring_network_gc)$community,
# NA)
## Big giant hairy ball
## ~13 sec
# tic()
# layout_net = hiring_network %>%
# layout_with_stress() %>%
# `colnames<-`(c('x', 'y')) %>%
# as_tibble()
# toc()
## Focus on Oxford
## GC only; ~7 sec
# tic()
# layout_net = create_layout(hiring_network_gc, 'focus', focus = 512)
# toc()
## Stress majorization
## ~2 sec
tic()
layout_net = create_layout(hiring_network_gc, 'stress')
toc()
## 0.516 sec elapsed
layout_net %>%
## NB neither of these preserve the layout_tbl_graph class, which breaks things when we get to ggraph()
# filter(total_placements > 0) %>%
# induced_subgraph(which(degree(., mode = 'out') > 0)) %>%
ggraph() +
geom_edge_parallel(arrow = arrow(length = unit(.02, 'npc'),
angle = 15,
type = 'closed'),
# spread = 5,
alpha = .05) +
geom_node_point(aes(#size = log10(out_centrality),
alpha = log10(out_centrality),
# color = as.factor(community)
color = cluster_label
# color = log10(out_centrality)
), size = 2) +
# scale_color_brewer(palette = 'Set1', na.value = 'black') +
scale_color_viridis_d(na.value = 'black', name = 'Semantic cluster') +
# scale_size_discrete(range = c(2, 6)) +
scale_alpha_continuous(range = c(.2, 1),
name = 'Out centrality (log10)') +
# scale_size_continuous(range = c(1, 3)) +
theme_graph()
ggsave(str_c(plots_path, 'hairball.png'),
width = 12, height = 12)
ggsave(str_c(paper_folder, 'fig_hairball.png'),
width = 12, height = 12)
# hiring_network_gc %>%
# induced_subgraph(!is.na(V(.)$cluster_lvl1)) %>%
# ggraph() +
# geom_node_label(aes(size = prestigious,
# label = univ_name,
# # color = as.factor(community))) +
# fill = cluster_lvl4), color = 'black') +
# geom_edge_fan(aes(linetype = aos_category, color = aos_category),
# width = 1,
# arrow = arrow(length = unit(.01, 'npc')),
# spread = 5) +
# scale_edge_color_brewer(palette = 'Set1') +
# scale_fill_brewer(palette = 'Set3', na.value = 'grey75') +
# scale_size_discrete(range = c(3, 5)) +
# theme_graph()
## Chord diagram
# hiring_network_gc %>%
# # induced_subgraph(which(degree(., mode = 'out') > 0)) %>%
# ggraph(layout = 'linear', sort.by = 'community_coarse', circular = TRUE) +
# geom_edge_arc(arrow = arrow(length = unit(.01, 'npc')), alpha = .1) +
# geom_node_point(aes(size = prestigious,
# # color = as.factor(community))) +
# color = cluster_lvl4)) +
# scale_color_brewer(palette = 'Set1', guide = 'none') +
# theme_graph()
## Separate networks for each cluster
# cluster_ids = hiring_network %>%
# as_tibble() %>%
# split(., .$cluster_lvl4) %>%
# map(pull, name)
#
# cluster_ids %>%
# map(~induced_subgraph(hiring_network, .))
# hiring_network %>%
# # filter(!is.na(cluster_lvl4)) %>%
# ggraph(layout = 'manual',
# node.positions = layout_net) +
# geom_edge_fan(alpha = .1) +
# geom_node_point(aes(color = cluster_label), show.legend = TRUE) +
# # facet_nodes(vars(cluster_lvl4)) +
# scale_color_viridis_d(na.value = 'grey90') +
# theme_graph()
## output ----
Save university-level data with network statistics
write_rds(univ_df, str_c(data_folder, '03_univ_net_stats.rds'))
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tictoc_1.0.1 assertthat_0.2.1 ggbeeswarm_0.6.0 ggforce_0.3.3
## [5] broom_0.7.9 ggraph_2.0.5 tidygraph_1.2.0 igraph_1.2.6
## [9] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [13] readr_2.0.1 tidyr_1.1.3 tibble_3.1.4 ggplot2_3.3.5
## [17] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-152 fs_1.5.0 lubridate_1.7.10 RColorBrewer_1.1-2
## [5] httr_1.4.2 tools_4.1.0 backports_1.2.1 bslib_0.2.5.1
## [9] utf8_1.2.2 R6_2.5.1 vipor_0.4.5 lazyeval_0.2.2
## [13] DBI_1.1.1 mgcv_1.8-36 colorspace_2.0-2 withr_2.4.2
## [17] tidyselect_1.1.1 gridExtra_2.3 compiler_4.1.0 cli_3.0.1
## [21] rvest_1.0.1 xml2_1.3.2 plotly_4.9.4.1 labeling_0.4.2
## [25] sass_0.4.0 scales_1.1.1 digest_0.6.27 rmarkdown_2.9
## [29] pkgconfig_2.0.3 htmltools_0.5.1.1 dbplyr_2.1.1 highr_0.9
## [33] htmlwidgets_1.5.3 rlang_0.4.11 readxl_1.3.1 rstudioapi_0.13
## [37] jquerylib_0.1.4 farver_2.1.0 generics_0.1.0 jsonlite_1.7.2
## [41] crosstalk_1.1.1 magrittr_2.0.1 Matrix_1.3-4 Rcpp_1.0.7
## [45] munsell_0.5.0 fansi_0.5.0 viridis_0.6.1 lifecycle_1.0.0
## [49] stringi_1.7.4 yaml_2.2.1 MASS_7.3-54 grid_4.1.0
## [53] ggrepel_0.9.1 crayon_1.4.1 lattice_0.20-44 graphlayouts_0.7.1
## [57] haven_2.4.1 splines_4.1.0 hms_1.1.0 knitr_1.33
## [61] pillar_1.6.2 codetools_0.2-18 reprex_2.0.0 glue_1.4.2
## [65] evaluate_0.14 data.table_1.14.0 modelr_0.1.8 vctrs_0.3.8
## [69] tzdb_0.1.2 tweenr_1.0.2 cellranger_1.1.0 gtable_0.3.0
## [73] polyclip_1.10-0 xfun_0.24 viridisLite_0.4.0 beeswarm_0.4.0
## [77] ellipsis_0.3.2